# Load dplyr for the filter()/row_number() verbs used during data cleaning.
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union

# Fix the RNG seed so any sampling later in the script is reproducible.
set.seed(1122)

# Read the training and test splits of the Adult census dataset
# (assumed to be in the working directory).
adult_train <- read.csv('adult-train.csv', header = TRUE)
adult_test <- read.csv('adult-test.csv', header = TRUE)
# Identify rows whose occupation, native_country, or workclass is the "?"
# missing-value placeholder, then drop them from each split.
filterTrainData <- which(adult_train$occupation == "?" | adult_train$native_country == "?" | adult_train$workclass == "?")
# BUG FIX: the original omitted the negation here, so adult_train.df KEPT
# only the rows containing "?" — the opposite of the test-set treatment below.
adult_train.df <- adult_train %>% filter(!row_number() %in% filterTrainData)
filterTestData <- which(adult_test$occupation == "?" | adult_test$native_country == "?" | adult_test$workclass == "?")
adult_test.df <- adult_test %>% filter(!row_number() %in% filterTestData)
adult_train.df
adult_test.df
library(rpart)
library(rpart.plot)
# Fit a classification tree predicting income from every other attribute.
# NOTE(review): this trains on the raw adult_train, not the "?"-filtered
# adult_train.df built above — confirm that is intentional.
model <- rpart(income ~ ., method = "class", data = adult_train)
print(model)
## n= 32560
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 32560 7841 <=50K (0.75918305 0.24081695)
## 2) relationship=Not-in-family,Other-relative,Own-child,Unmarried 17799 1178 <=50K (0.93381651 0.06618349)
## 4) capital_gain< 7073.5 17481 872 <=50K (0.95011727 0.04988273) *
## 5) capital_gain>=7073.5 318 12 >50K (0.03773585 0.96226415) *
## 3) relationship=Husband,Wife 14761 6663 <=50K (0.54860782 0.45139218)
## 6) education=10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,HS-grad,Preschool,Some-college 10329 3456 <=50K (0.66540807 0.33459193)
## 12) capital_gain< 5095.5 9807 2944 <=50K (0.69980626 0.30019374) *
## 13) capital_gain>=5095.5 522 10 >50K (0.01915709 0.98084291) *
## 7) education=Bachelors,Doctorate,Masters,Prof-school 4432 1225 >50K (0.27639892 0.72360108) *
# Visualize the fitted tree. extra=104 displays the class, per-class
# probabilities, and the percentage of observations at each node;
# fallen.leaves aligns all leaves at the bottom of the plot.
# FIX: use TRUE rather than the reassignable shorthand T.
rpart.plot(model, extra = 104, fallen.leaves = TRUE, type = 4,
           main = "Income Train Dataset Decision Tree")
# Report the three most important predictors by rpart's importance score.
print(model$variable.importance[1:3])
## relationship marital_status capital_gain
## 2394.689 2356.078 1031.296
# Summarise the top-three predictors (paste's default sep = " " joins the
# two fragments into the same sentence as before).
paste("The top three important predictors in the model are:",
      "relationship, marital_status, capital_gain")
## [1] "The top three important predictors in the model are: relationship, marital_status, capital_gain"
# The first split is done on relationship. The predicted class of the first node is "<=50K".
# The distribution of observations between the "<=50K" and ">50K" classes at the first (root) node is 24719 and 7841, respectively (n = 32560, loss = 7841 in the printed model).
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice

# Score the unpruned tree on the test split and summarise its performance
# with both a raw contingency table and caret's full confusion-matrix report.
predict.df <- predict(model, adult_test, type = "class")
confusionmat_table <- table(predict.df, adult_test$income)
confusionmat <- confusionMatrix(predict.df, as.factor(adult_test$income))
confusionmat
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 11805 1901
## >50K 630 1945
##
## Accuracy : 0.8445
## 95% CI : (0.8389, 0.8501)
## No Information Rate : 0.7638
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5137
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9493
## Specificity : 0.5057
## Pos Pred Value : 0.8613
## Neg Pred Value : 0.7553
## Prevalence : 0.7638
## Detection Rate : 0.7251
## Detection Prevalence : 0.8418
## Balanced Accuracy : 0.7275
##
## 'Positive' Class : <=50K
##
# Balanced accuracy = mean of sensitivity and specificity, computed from the
# raw confusion table (caret treats "<=50K" as the positive class here).
sensitivity.df <- sensitivity(confusionmat_table)
specificity.df <- specificity(confusionmat_table)
balanced.df <- mean(c(sensitivity.df, specificity.df))
paste("The balanced accuracy is: ", round(balanced.df, digits = 3))
## [1] "The balanced accuracy is: 0.728"
# Balanced error rate is the complement of balanced accuracy.
balanced_errrate.df <- 1 - balanced.df
paste("The balanced error rate:", round(balanced_errrate.df, digits = 3))
## [1] "The balanced error rate: 0.272"
# Balanced accuracy is a metric we can use to assess the performance of a classification model.
# It is calculated as Balanced accuracy = (Sensitivity + Specificity) / 2
# Sensitivity: The “true positive rate” – the percentage of positive cases the model is able to detect
# Specificity: The “true negative rate” – the percentage of negative cases the model is able to detect
# This metric is particularly useful when the two classes are imbalanced – that is, one class appears much more than the other
library(ROCR)
# AUC - AUC stands for "Area under the ROC Curve." That is, AUC measures the entire two-dimensional area underneath the entire ROC curve from (0,0) to (1,1).
# ROC CURVE - ROC (Receiver Operator Characteristic Curve) can help in deciding the best threshold value. A ROC curve is plotted with FPR on the X-axis and TPR on the y-axis.
# Column 2 of the probability matrix is the predicted probability of the
# second factor level (">50K"), which ROCR scores against the true labels.
pred.rocr <- predict(model, newdata = adult_test, type = "prob")[, 2]
roc.pred <- prediction(pred.rocr, adult_test$income)
roc.perf <- performance(roc.pred, "tpr", "fpr")
auc <- performance(roc.pred, measure = "auc")
paste("AUC: ", round(auc@y.values[[1]], digits = 3))
## [1] "AUC: 0.845"
# FIX: use TRUE rather than the reassignable shorthand T.
plot(roc.perf, colorize = TRUE, lwd = 3)
abline(0, 1)  # diagonal = performance of a random classifier
### Part 2-1-D
# Baseline accuracy of the unpruned tree on the test split, then the
# complexity-parameter (CP) table for pruning analysis.
base_acc <- mean(adult_test$income == predict.df)
printcp(model)
##
## Classification tree:
## rpart(formula = income ~ ., data = adult_train, method = "class")
##
## Variables actually used in tree construction:
## [1] capital_gain education relationship
##
## Root node error: 7841/32560 = 0.24082
##
## n= 32560
##
## CP nsplit rel error xerror xstd
## 1 0.126387 0 1.00000 1.00000 0.0098398
## 2 0.064022 2 0.74723 0.74723 0.0088402
## 3 0.037495 3 0.68320 0.68320 0.0085321
## 4 0.010000 4 0.64571 0.64571 0.0083394
plotcp(model)
# Pick the CP whose cross-validated error (xerror) is smallest.
# FIX: index the cptable matrix by row AND the "CP" column; the original's
# single linear index (cptable[which.min(...)]) only returned the right value
# by accident, because CP happens to be the first column in column-major order.
best_cp <- model$cptable[which.min(model$cptable[, "xerror"]), "CP"]
paste("The optimal CP value is", best_cp)
## [1] "The optimal CP value is 0.01"
# Prune at the selected CP (previously hard-coded as 0.01, the same value).
model_prune <- prune(model, cp = best_cp)
predict_prune <- predict(model_prune, adult_test, type = "class")
prune_acc <- mean(predict_prune == adult_test$income)
prune_data <- data.frame(base_acc, prune_acc)
prune_data
# The accuracy of the model on the test data is equal when the tree is pruned, so there won't be any benefit of pruning on this model.
set.seed(1122)
# Class counts in the raw training data.
train_table <- table(adult_train$income)
# Per the printed model's root node, ~75.9% of the training data is "<=50K"
# and ~24.1% is ">50K" (the original comment's 75.11%/24.899% did not match).
paste("The total number of observations are in the class <=50K: ", train_table[1])
## [1] "The total number of observations are in the class <=50K: 24719"
paste("The total number of observations are in the class >50K: ", train_table[2])
## [1] "The total number of observations are in the class >50K: 7841"
set.seed(1122)
# Undersample the majority class ("<=50K") down to the minority-class size so
# the two income classes are balanced in the new training set.
lincome_train <- which(adult_train$income == "<=50K")
gincome_train <- which(adult_train$income == ">50K")
# FIX: don't name this variable "sample" — that masks base::sample().
majority_sample <- sample(lincome_train, size = length(gincome_train))
new_adtrain <- adult_train[c(majority_sample, gincome_train), ]
new_adtrain
table(new_adtrain$income)
##
## <=50K >50K
## 7841 7841
library(rpart)
library(rpart.plot)
# Refit the classification tree on the balanced (undersampled) training data
# and evaluate it on the untouched test split.
new_model <- rpart(income ~ ., method = "class", data = new_adtrain)
new_predictor <- predict(new_model, adult_test, type = "class")
new_confusion_mattable <- table(new_predictor, adult_test$income)
new_confusion_mat <- confusionMatrix(new_predictor, as.factor(adult_test$income))
new_confusion_mat
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 9727 660
## >50K 2708 3186
##
## Accuracy : 0.7931
## 95% CI : (0.7868, 0.7993)
## No Information Rate : 0.7638
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5158
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7822
## Specificity : 0.8284
## Pos Pred Value : 0.9365
## Neg Pred Value : 0.5405
## Prevalence : 0.7638
## Detection Rate : 0.5974
## Detection Prevalence : 0.6380
## Balanced Accuracy : 0.8053
##
## 'Positive' Class : <=50K
##
# Balanced accuracy / error rate for the model trained on balanced data,
# computed the same way as for the original model above.
new_sensitivity <- sensitivity(new_confusion_mattable)
new_specificity <- specificity(new_confusion_mattable)
new_balanced <- mean(c(new_sensitivity, new_specificity))
paste("The balanced accuracy is: ", round(new_balanced, digits = 3))
## [1] "The balanced accuracy is: 0.805"
new_balanced_error_rate <- 1 - new_balanced
paste("The balanced error rate:", round(new_balanced_error_rate, digits = 3))
## [1] "The balanced error rate: 0.195"
# Balanced accuracy is a metric we can use to assess the performance of a classification model
# It is calculated as: Balanced accuracy = (Sensitivity + Specificity) / 2
# Sensitivity: The “true positive rate” – the percentage of positive cases the model is able to detect
# Specificity: The “true negative rate” – the percentage of negative cases the model is able to detect
# This metric is particularly useful when the two classes are imbalanced – that is, one class appears much more than the other
# AUC - AUC stands for "Area under the ROC Curve." That is, AUC measures the entire two-dimensional area underneath the entire ROC curve from (0,0) to (1,1).
# ROC CURVE - ROC (Receiver Operator Characteristic Curve) can help in deciding the best threshold value. A ROC curve is plotted with FPR on the X-axis and TPR on the y-axis.
# ROC/AUC for the balanced-data model.
# FIX: evaluate on the same dataset as this model's confusion matrix above
# (adult_test). The original used the "?"-filtered adult_test.df here, making
# the AUC incomparable with the other metrics reported for this model — the
# transcribed "AUC: 0.845" below came from that inconsistent run.
new_pred.rocr <- predict(new_model, newdata = adult_test, type = "prob")[, 2]
new_roc.pred <- prediction(new_pred.rocr, adult_test$income)
new_roc.perf <- performance(new_roc.pred, "tpr", "fpr")
new_auc <- performance(new_roc.pred, measure = "auc")
paste("AUC: ", round(new_auc@y.values[[1]], digits = 3))
## [1] "AUC: 0.845"
plot(new_roc.perf, colorize = TRUE, lwd = 3)
abline(0, 1)  # diagonal = performance of a random classifier
# The balanced accuracy, sensitivity, specificity, positive predictive value and AUC of the model used in 2.1 (c)
# Sensitivity : 0.949
# Specificity : 0.505
# Positive Predictive Value : 0.861
# Balanced Accuracy : 0.728
# AUC : 0.845
# Balanced Error Rate : 0.272
# The balanced accuracy, sensitivity, specificity, positive predictive value and AUC of the model used in 2.1 (e)
# Sensitivity : 0.782
# Specificity : 0.828
# Positive Predictive Value : 0.936
# Balanced Accuracy : 0.805
# AUC : 0.815
# Balanced Error Rate : 0.195
# sensitivity and specificity will always be inversely related (i.e., one increases as the other decreases). With the balanced data using under sampling on the model (e) the specificity has been increased whereas sensitivity has been decreased, but the overall balanced accuracy has been increased, ppv value increased.